/*
Steps in setting up the QALY analysis

1) Import the Chambers dataset

2) Match to the Redbook

3) Extract the relevant claims

4) Identify the relevant experimental individuals

*/




// STEP 1
clear
import excel using "2021 02 24 - 1999-2015 raw data.xls", firstrow

// clean up the basic data
gen priority = (priority_or_standard == "Priority")

sort fda_approval_year drugname

keep fda_approval_year drugname active_ingredients primary_diseasename priority_or_standard approval_date comparator incremental_qaly
order fda_approval_year priority_or_standard incremental_qaly drugname active_ingredients comparator primary_diseasename
keep if fda_approval_year <= 2013

compress
replace comparator = upper(comparator)


export delimited using simple_qaly_data.csv, replace
// Then manually clean up the clean comparator info so that it matches the drug names => simple_qaly_data_w_additions.xlsx



clear
import excel using simple_qaly_data_w_additions, firstrow

egen entry_event = group(drugname approval_date)


collapse (mean) incremental_qaly (max) max_incremental_qaly = incremental_qaly (first) entry_event approval_date priority_or_standard, by(drugname clean_comparator primary_diseasename)

drop if clean_comparator == ""

unique entry_event

summ incremental_qaly, detail // do we have enough variation?


gen priority = (priority_or_standard == "Priority")
replace priority = . if priority == 0 & priority_or_standard != "Standard"
tab priority

regress incremental_qaly priority, cluster (entry_event)

save drug_entry_comparator_information, replace








// STEP 2

// get a raw list of drug names
clear
use drug_entry_comparator_information

keep drugname clean_comparator

rename clean_comparator drug
split drug, p(";")
drop drug
rename drugname drug0

gen id = _n
reshape long drug, i(id) j(seq)

replace drug = trim(drug)
drop if drug == ""


keep drug
duplicates drop

rename drug drugname

save list_of_drugs_chambers_v2, replace



clear 
use "redbook"

keep ndcnum gennme maintin maintds thergrp thrgrds thercls thrclds prodnme mstfmds

rename prodnme drugname

merge m:1 drugname using list_of_drugs_chambers_v2
keep if _merge == 1 | _merge == 3
gen fuzzy_candidates = (_merge == 1)
gen matched = (_merge == 3)
drop _merge

gen matched_name = drugname if matched == 1

gen fullname = drugname
split drugname, p(" ")
drop drugname
rename drugname1 drugname

merge m:1 drugname using list_of_drugs_chambers_v2
keep if _merge == 1 | _merge == 3
replace matched_name = drugname if _merge == 3 & matched != 1
drop _merge

replace matched = (matched_name != "")

// include generics so that we have a more complete long-term picture
replace gennme = drugname if gennme == "Inactive Record"
egen chemical_group = group(gennme)
egen max_matched = max(matched), by(chemical_group)
gen prob_generic = (max_matched == 1 & matched == 0)

// fill in matched drug name
gsort gennme - matched_name
replace matched_name = matched_name[_n-1] if matched_name == "" & gennme == gennme[_n-1]

keep if max_matched == 1

drop drugname*
rename fullname drugname

sort matched_name prob_generic ndcnum

// manual
drop if drugname == "VIAGRA" // wrong version


// keep main information
keep ndcnum matched_name prob_generic

egen matched_name_id = group(matched_name)

preserve
keep matched_name_id matched_name
duplicates drop
save matched_name_mapping, replace
restore

drop matched_name
save matched_chambers_redbook_info, replace




// STEP 3: create claims and chronic classification dataset

clear
set more off
use enrolid genind ndcnum svcdate using ccaed1996

// filter data: this has changed
merge m:1 ndcnum using "matched_chambers_redbook_info", keepus(matched_name_id prob_generic) keep(mat)
drop _merge

// start year necessary for generating first use indicator (relevant for reduced form)
merge m:1 enrolid using "all_enrolid_info", keep(mat) keepus(start_year)
drop _merge



forvalues j=1997/2013 {
	append using ccaed`j', keep(enrolid genind ndcnum svcdate)
	
	drop matched_name_id prob_generic start_year
	
	merge m:1 ndcnum using "matched_chambers_redbook_info", keepus(matched_name_id prob_generic) keep(mat)
	drop _merge
	
	// filter data (keep size down)
	merge m:1 enrolid using "all_enrolid_info", keep(mat) keepus(start_year)
	drop _merge
	
}


// generics: genind (type of situation and generic vs. brand)
destring genind, replace force
keep if genind >= 1 & genind <= 5

gen generic = (genind == 4) | (genind == 5)
gen generic_available = (genind == 3)


// tack on Redbook information to get ingredient name
// get things like: indic_id, generic name, extended release version (indication from Cortellis)
drop genind


///////////////
// END SETUP //
///////////////

// figure out when a user first starts using anything in the class
sort enrolid matched_name_id svcdate generic // go with brand if multiple of the same day
egen first_use = tag(enrolid matched_name_id)

// label new subscription (first and six months after the start of the sample)
gen raw_first_use = first_use
replace first_use = 0 if first_use == 1 & svcdate < mdy(6,1,start_year) // MODIFIED


// NEW: also create list of first use dates
gen year = year(svcdate)
gen quarter = qofd(svcdate)
tab quarter

gen prescriptions = 1
egen drugid = group(matched_name_id generic) // a drug is ingredient x generic x extended release

// Crude: get drug with most prescriptions in a given year
// 1 entry per enrollee, year, and indication
// additional variables: first year using drugs in some area (allows us to see if drug used in entry year)
gsort enrolid quarter matched_name_id - prescriptions - generic
collapse (sum) prescriptions (max) first_use (first) generic year, by(enrolid matched_name_id quarter)


save user_quarter_choice_chambers, replace 



// summary stats to figure out what's happening
//clear
//use user_quarter_choice_chambers
clear
use user_quarter_choice_chambers

egen t = tag(enrolid)
tab t // 380k people
drop t

egen t = tag(matched_name_id)
tab t // 115 drugs
drop t

tab first_use


collapse (sum) prescriptions (sum) first_use, by(matched_name_id enrolid year)
collapse (p50) prescriptions (sum) first_use, by(matched_name_id)

merge m:1 matched_name_id using matched_name_mapping
keep if _merge == 3
drop _merge 

//summ prescriptions if year == 2003, detail
//summ first_use, detail

//gsort year - prescriptions
rename prescriptions median_prescriptions
gsort - median_prescriptions
save median_prescriptions_stats_drug_level, replace // about 157/193 are chronic



// STEP 4: create pools of individuals involved in quasi-experiment



clear
use drug_entry_comparator_information

// keep first entry event for each drug
egen min_approval_date = min(approval_date), by(drugname)
keep if approval_date == min_approval_date

// aggregate to the pair level: 104 unique entering drugs out of 147 comparisons
collapse (mean) incremental_qaly (max) max_incremental_qaly priority ///
(first) approval_date primary_diseasename, by(drugname clean_comparator)

rename clean_comparator drug
split drug, p(";")
drop drug

gen id = _n
reshape long drug, i(id) j(seq)
drop if drug == ""

rename drug comparator
drop id seq

egen comparison_id = group(drugname comparator)

order comparison_id drugname comparator

rename drugname matched_name
merge m:1 matched_name using matched_name_mapping
keep if _merge == 3
drop _merge
rename (matched_name matched_name_id) (drugname drug_id)

rename comparator matched_name
merge m:1 matched_name using matched_name_mapping
keep if _merge == 3
drop _merge
rename (matched_name matched_name_id) (comparator comparator_id)

display _N // 169 pairs
save chambers_pairs, replace





/*
Create a drug level dataset to figure out which events they are a part of
*/
clear
use chambers_pairs

sort comparison_id

rename drugname drugname1
rename comparator drugname2

reshape long drugname, i(comparison_id) j(seq)

gen entering_drug = (seq == 1)

rename drugname matched_name
merge m:1 matched_name using matched_name_mapping
keep if _merge == 3
drop _merge

gen entry_quarter = qofd(approval_date)

keep matched_name_id entering_drug comparison_id entry_quarter

save chambers_drug_level_info, replace


// given resources, find users
clear
use user_quarter_choice_chambers


// 1) Identify when each user first uses each drug (excluding the "first_use == 0")
sort enrolid matched_name_id quarter 
egen t = tag(enrolid matched_name_id)
tab first_use if t == 1 // 80.2% true


keep if t == 1 & first_use == 1
keep enrolid matched_name_id quarter

save user_first_use_by_drug, replace


// 2) Find users eligible for 
clear
use user_first_use_by_drug

joinby matched_name_id using chambers_drug_level_info

// for each comparison, keep only the earliest "first date" within a comparison set
sort enrolid comparison_id quarter
egen t = tag(enrolid comparison_id)
keep if t == 1
drop t

// compute relative quarter of first use
gen rel_quarter = quarter - entry_quarter
keep if rel_quarter >= -3 & rel_quarter <= 4

keep enrolid comparison_id entry_quarter rel_quarter entering_drug // entering drug indicates whether the person used the entering drug first (within the set of comparators)

gen control = (rel_quarter <= 0)
gen treat = (rel_quarter > 0)

tab control
tab treat

// this part makes sense
summ entering_drug if control == 1
summ entering_drug if treat == 1

// tack on the relevant comparators
merge m:1 comparison_id using chambers_pairs, keepus(drug_id comparator_id)
keep if _merge == 3
drop _merge

save eligible_analysis_users, replace



